import librosa
import numpy as np
import matplotlib.pyplot as plt
from librosa.display import waveplot
import IPython.display as ipd
import pyworld as pw
# Load the clip; librosa.load resamples to its default rate (22050 Hz) and mixes to mono.
y, sr = librosa.load('./I_have_nothing.wav')
# Report the basic properties of the loaded waveform.
print(f'Sampling frequency is {sr}\nNumber of samples is {len(y)}\nclip length in sec is {len(y)/sr}')
# Output: Sampling frequency is 22050 / Number of samples is 375296 / clip length in sec is 17.020226757369613
# Plot the full waveform (amplitude vs. time).
plt.figure(figsize=(14, 5))
# NOTE(review): librosa.display.waveplot was deprecated in librosa 0.9 and
# removed in 0.10 — newer installs need librosa.display.waveshow instead;
# confirm the pinned librosa version before running.
figure = waveplot(y, sr=sr)
plt.show()
# Play back at the original rate, then with the `rate` argument scaled —
# presumably this changes playback speed (and therefore pitch) without
# resampling the data; confirm against IPython.display.Audio docs.
ipd.Audio(y, rate=sr)
ipd.Audio(y, rate=sr*1.5)  # faster playback, higher pitch
ipd.Audio(y, rate=sr*0.5)  # slower playback, lower pitch
# Take the first two seconds of audio for closer inspection.
small_y = y[0*sr:2*sr]
plt.figure(figsize=(14, 5))
# Fix: pass sr explicitly. waveplot otherwise falls back to its own default
# of 22050 Hz, which only happens to match here — if the file were loaded at
# any other rate the time axis would be silently mislabeled.
figure = waveplot(small_y, sr=sr)
plt.show()
# Play back the two-second excerpt.
ipd.Audio(small_y, rate=sr)

# Short-time Fourier transform: complex spectrogram of the full clip.
spec = librosa.stft(y)
# Convert magnitudes to decibels so the dynamic range is visible.
spec_db = librosa.amplitude_to_db(np.abs(spec))
plt.figure(figsize=(14, 5))
librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='hz')
plt.show()
# Inspect one analysis frame (frame index 20) across all frequency bins.
plt.figure(figsize=(14, 5))
plt.plot(spec_db[:, 20])
plt.show()
# Pitch (F0) analysis of the two-second excerpt using WORLD.
signal = small_y
wav64 = signal.astype(np.double)      # WORLD routines expect float64 samples
_f0, t = pw.dio(wav64, sr)            # raw pitch extractor
f0 = pw.stonemask(wav64, _f0, t, sr)  # pitch refinement
# Pitch contour on top, waveform underneath for visual alignment.
plt.figure(figsize=(14, 10))
plt.subplot(2, 1, 1)
plt.plot(f0)
plt.xlabel('time')
plt.ylabel('hz')
plt.subplot(2, 1, 2)
figure = waveplot(signal, sr=sr)
ipd.Audio(signal, rate=sr)
# WORLD analysis / resynthesis round trip on the excerpt.
signal = small_y
# Fix: convert to float64 once. The original repeated signal.astype(np.double)
# before each WORLD call, allocating four full copies of the array.
x = signal.astype(np.double)
# feature extraction
_f0, t = pw.dio(x, sr)            # raw pitch extractor
f0 = pw.stonemask(x, _f0, t, sr)  # pitch refinement
# what happens if we uncomment this
# f0 = 3*f0
# what happens if we uncomment this
# f0 = 0.6*f0
sp = pw.cheaptrick(x, f0, t, sr)  # extract smoothed spectrogram (spectral envelope)
ap = pw.d4c(x, f0, t, sr)         # extract aperiodicity
# resynthesis: rebuild a waveform from the (f0, sp, ap) parameters
synth_signal = pw.synthesize(f0, sp, ap, sr)
# Compare the original and resynthesized audio by ear.
ipd.Audio(signal, rate=sr)
ipd.Audio(synth_signal, rate=sr)
# Next, we'll extract the first 13 Mel-frequency cepstral coefficients (MFCCs)
# 512 samples at sr=22050 is ~23 ms per hop — a conventional analysis step
# that trades time resolution against frame count/compute.
hop_length = 512 #(why?)
mfcc = librosa.feature.mfcc(y=small_y, sr=sr, hop_length=hop_length, n_mfcc=13)
# Shape is (n_mfcc, n_frames): 13 coefficients per frame, and with a
# 2 s excerpt n_frames = 1 + 44100//512 = 87 (matches the output below).
print(f'mfcc shape {mfcc.shape}') # why?
# what happens if we uncomment this
# mfcc = mfcc[1:][:]  (drops coefficient 0, the overall log-energy-like term)
# Padding first and second deltas
delta_mfcc = librosa.feature.delta(mfcc)            # first-order delta ("velocity")
delta2_mfcc = librosa.feature.delta(mfcc, order=2)  # second-order delta ("acceleration")
# We'll show each in its own subplot
# NOTE(review): showMfcc is not defined anywhere in this file — presumably a
# notebook helper defined in another cell; confirm before running as a script.
showMfcc(mfcc, delta_mfcc, delta2_mfcc)
# Output: mfcc shape (13, 87)
# Questions for further study:
# - How can I train an LM model using these representations?
# - How can I use it for classification?
#   e.g. model1.fit(delta_mfcc_train); model1.score(utterance1)  (Why?)
# - There are other options based on GMMs, such as GMM-UBM and i-vectors.